#' ---
#' title: "Data cleaning"
#' author: "Blinded"
#' ---
#' 
#' # Load packages
#' 
#' 
## ----------------------------------------------------------------------------------------------------------------------------------
rm(list = ls())
library(dplyr)
library(here)
library(logr)
library(data.table)

#' 
#' 
#' # Load and clean data
#' 
#' ## Loading
#' 
## ----------------------------------------------------------------------------------------------------------------------------------
## Read in the survey data. The relative paths below should resolve
## correctly if you open the optimizing schools Rproj in R.
## Open the run log for this script
log_open(file_name = "code/logs/01_clean_data.log", show_notes = FALSE)

sep("Loading and subsetting data")

## Read the raw survey export and attach three row-level logical flags
raw_data_csv <- here("data/TESS094 Johnson_FinalData_notract.csv") %>%
  read.csv() %>%
  mutate(
    ## qualified complete: QUAL equals 1 (a missing QUAL counts as FALSE)
    is_qualified_complete = !is.na(QUAL) & QUAL == 1,

    ## randomized: P_CONDITION is non-missing
    ## NOTE(review): an earlier comment also mentioned a parent-sample
    ## condition variable; only P_CONDITION is checked here -- confirm
    is_randomized = !is.na(P_CONDITION),

    ## weighted: has at least one of the two survey weights
    is_weighted = !is.na(WEIGHT_Genpop) | !is.na(WEIGHT_Parent)
  )

## Starting analytic sample: qualified completes that carry a weight
df_analytic_init <- filter(raw_data_csv, is_qualified_complete, is_weighted)

## Log how many unique respondents survive the filter
logr::log_print(sprintf(
  "Based on the qualified complete filters, we go from %s unique respondents to %s uniq respondents",
  n_distinct(raw_data_csv$CaseId),
  n_distinct(df_analytic_init$CaseId)
))


#' 
#' 
#' ## Step 2: rename vars based on questionnaire
#' 
#' Use derived prefix for all variables we create in the raw data (including rename)
#' 
## ----------------------------------------------------------------------------------------------------------------------------------
sep("Renaming variables")

## Questionnaire item codes and the analysis names that replace them.
## The two vectors are paired by position, so their order must stay in sync.
old_names <- c(
  "QS1",
  "Q2",
  "Q3",
  "Q4",
  "Q6",
  "Q5",
  "Q8",
  "Q10A",
  "Q10B",
  "Q10C",
  "Q10D"
)
new_names <- c(
  "raw_parentstatus",
  "derived_compcheck",
  "derived_binary_morefair",
  "derived_binary_freeresp",
  "derived_binary_savetime",
  "derived_continuous_morefair",
  "derived_binary_morefair_poststatus",
  "derived_teachers_input",
  "derived_teachers_bestint",
  "derived_leadership_input",
  "derived_leadership_bestint"
)

## Guard against an incomplete mapping before renaming anything
stopifnot(length(old_names) == length(new_names))

## setnames() renames by reference, and a plain assignment only creates a
## second binding to the same object. Take an explicit copy so that
## df_analytic_init keeps its original questionnaire column names.
df_analytic_newnames <- data.table::copy(df_analytic_init)

## Rename in place on the copy (errors if any old name is absent)
data.table::setnames(df_analytic_newnames, old = old_names, new = new_names)

## Working data frame for the recoding step that follows
df_toclean <- df_analytic_newnames

sep("Cleaning variables")

#' 
#' 
#' ## Step 3: code variables so that they can be analyzed
#' 
## ----------------------------------------------------------------------------------------------------------------------------------
## code to delete - checking raw values
## Names of the renamed outcome and teacher/leadership items to inspect
key_v <- c(
  "derived_binary_morefair",
  "derived_binary_morefair_poststatus",
  "derived_continuous_morefair",
  "derived_teachers_input",
  "derived_teachers_bestint",
  "derived_leadership_input",
  "derived_leadership_bestint"
)

#' 
#' Setting 98 and 99 to NA, etc; more informative levels
#' 
## ----------------------------------------------------------------------------------------------------------------------------------

## Codes 1-5 of the teacher/leadership items and the labels attached to them,
## paired by position
likert_breaks <- c(1, 2, 3, 4, 5)
likert_cat <- c(
  "Strongly disagree",
  "Disagree",
  "Agree",
  "Strongly agree",
  "Not applicable"
)

## Recode the renamed survey items into analysis-ready variables.
## Statement order matters inside this mutate(): the is_missing_* flags are
## computed from the raw codes before derived_continuous_morefair is
## overwritten, and later entries reuse columns created earlier in the call.
df_toclean <- df_toclean %>%
  mutate(
    ## flags for the code 98 on each outcome, taken from the raw values
    is_missing_binaryDV = derived_binary_morefair == 98,
    is_missing_statupdate = derived_binary_morefair_poststatus == 98,
    is_missing_contDV = derived_continuous_morefair == 98,

    ## continuous DV: code 98 becomes NA, everything else is kept
    derived_continuous_morefair = case_when(
      derived_continuous_morefair == 98 ~ NA,
      TRUE ~ derived_continuous_morefair
    ),

    ## binary DV: response option 2 is coded as the algorithm being more
    ## fair, option 1 as the other method; any other code becomes NA
    derived_alg_morefair = case_when(
      derived_binary_morefair == 2 ~ TRUE,
      derived_binary_morefair == 1 ~ FALSE,
      TRUE ~ NA
    ),
    derived_other_morefair = case_when(
      derived_binary_morefair == 1 ~ TRUE,
      derived_binary_morefair == 2 ~ FALSE,
      TRUE ~ NA
    ),
    derived_alg_savestime = case_when(
      derived_binary_savetime == 2 ~ TRUE,
      derived_binary_savetime == 1 ~ FALSE,
      TRUE ~ NA
    ),

    ## Q8 --- 1 = a predictive model; 2 = the other method post status update
    ## note the flip relative to the original question
    derived_other_morefair_poststatus = case_when(
      derived_binary_morefair_poststatus == 2 ~ TRUE,
      derived_binary_morefair_poststatus == 1 ~ FALSE,
      TRUE ~ NA
    ),
    derived_alg_morefair_poststatus = case_when(
      derived_binary_morefair_poststatus == 1 ~ TRUE,
      derived_binary_morefair_poststatus == 2 ~ FALSE,
      TRUE ~ NA
    ),

    ## the 12 P_CONDITION cells split into two crossed factors;
    ## "Other" marks any value outside 1-12 (including NA)
    derived_statusquo_cond = case_when(
      P_CONDITION %in% c(1, 2, 3) ~ "Counselor discretion",
      P_CONDITION %in% c(4, 5, 6) ~ "Parent requests",
      P_CONDITION %in% c(7, 8, 9) ~ "Simple rule",
      P_CONDITION %in% c(10, 11, 12) ~ "Weighted lottery",
      TRUE ~ "Other"
    ),
    derived_schoolcomp_cond = case_when(
      P_CONDITION %in% c(1, 4, 7, 10) ~ "Black/Hisp",
      P_CONDITION %in% c(2, 5, 8, 11) ~ "Integrated",
      P_CONDITION %in% c(3, 6, 9, 12) ~ "White",
      TRUE ~ "Other"
    ),

    ## general-population sample: has a non-missing WEIGHT_Genpop
    derived_genpop = !is.na(WEIGHT_Genpop),

    ## demographics
    derived_race = case_when(
      RACETHNICITY == 1 ~ "White NH",
      RACETHNICITY == 2 ~ "Black NH",
      RACETHNICITY == 4 ~ "Hispanic",
      RACETHNICITY == 6 ~ "Asian NH",
      TRUE ~ "2+ or\nOther"
    ),
    derived_min = derived_race %in% c("Black NH", "Hispanic"),

    ## single weight column: WEIGHT_Genpop when present, else WEIGHT_Parent
    derived_comb_weight = ifelse(is.na(WEIGHT_Genpop),
                                 WEIGHT_Parent,
                                 WEIGHT_Genpop),

    ## no fallback branch: EDUC5 values matching none of these become NA
    derived_educ_3cat = case_when(
      EDUC5 %in% c(1, 2) ~ "HS or less",
      EDUC5 %in% c(3) ~ "Some college",
      EDUC5 >= 4 ~ "College or\nprofessional school"
    ),
    derived_female = GENDER == 2,
    derived_gender_cat = case_when(
      derived_female ~ "Female",
      GENDER == 1 ~ "Male",
      TRUE ~ "Unknown"
    ),
    derived_pol = case_when(
      PartyID7 %in% c(1, 2, 3) ~ "Democrat",
      PartyID7 %in% c(5, 6, 7) ~ "Republican",
      PartyID7 %in% c(4) ~ "Independent",
      TRUE ~ "Unknown"
    ),
    derived_parent = case_when(
      raw_parentstatus == 1 ~ "Parent (current K-12)",
      raw_parentstatus == 2 ~ "Parent (not K-12)",
      raw_parentstatus == 3 ~ "Never parent",
      TRUE ~ "Other"
    ),

    ## TRUE when the pre- and post-status-update answers differ
    ## (NA if either answer is NA)
    derived_changeview =
      derived_other_morefair_poststatus != derived_other_morefair,

    ## numeric version of the continuous DV; code 98 was already recoded to
    ## NA above, so the comparison only lets non-missing values through
    derived_certainty_algfavorable = case_when(
      derived_continuous_morefair != 98 ~
        as.numeric(derived_continuous_morefair),
      TRUE ~ NA_real_
    ),

    ## ideology: code 8 and NA become missing in the numeric version
    derived_polideo_numeric = case_when(
      Ideo20 == 8 | is.na(Ideo20) ~ NA_real_,
      TRUE ~ as.numeric(Ideo20)
    ),
    derived_polideo_category = factor(
      case_when(
        Ideo20 %in% c(1, 2, 3) ~ "Slightly - extremely liberal",
        Ideo20 %in% c(4) ~ "Moderate",
        Ideo20 %in% c(5, 6, 7) ~ "Slightly - extremely conservative",
        TRUE ~ NA_character_
      ),
      levels = c("Slightly - extremely liberal",
                 "Moderate",
                 "Slightly - extremely conservative"),
      ordered = TRUE
    ),

    ## labelled versions of the four teacher/leadership items;
    ## codes outside likert_breaks become NA
    derived_teachers_input_labels = factor(derived_teachers_input,
                                           levels = likert_breaks,
                                           labels = likert_cat),
    derived_teachers_bestint_labels = factor(derived_teachers_bestint,
                                             levels = likert_breaks,
                                             labels = likert_cat),
    derived_leaders_input_labels = factor(derived_leadership_input,
                                          levels = likert_breaks,
                                          labels = likert_cat),
    derived_leaders_bestint_labels = factor(derived_leadership_bestint,
                                            levels = likert_breaks,
                                            labels = likert_cat),

    ## income brackets: levels are listed explicitly in INCOME4 code order
    ## (1-4); without them an ordered factor would rank the brackets by
    ## sorted label text instead
    derived_income = factor(
      case_when(
        INCOME4 == 1 ~ "Income: <$30,000",
        INCOME4 == 2 ~ "Income: $30-$60,000",
        INCOME4 == 3 ~ "Income: $60-$100,000",
        INCOME4 == 4 ~ "Income: >$100,000+"
      ),
      levels = c("Income: <$30,000",
                 "Income: $30-$60,000",
                 "Income: $60-$100,000",
                 "Income: >$100,000+"),
      ordered = TRUE
    )
  )
              

## Final analytic sample: keep only rows whose is_missing_binaryDV flag is
## FALSE (filter() also drops rows where the flag is NA)
df_analytic <- filter(df_toclean, !is_missing_binaryDV)

## throw error if any respondents missing condition
stopifnot(nrow(df_analytic %>% filter(derived_schoolcomp_cond == "Other")) == 0 &
          nrow(df_analytic %>% filter(derived_statusquo_cond == "Other")) == 0)

## Log how many case ids the DV filter removed, as a count and as a share
## of the rows present before the filter
dropped_case_ids <- setdiff(df_toclean$CaseId, df_analytic$CaseId)
log_print(sprintf(
  "Lose %s additional case ids, or %s of the sample, from removing those missing the binary rating DV",
  length(dropped_case_ids),
  round(length(dropped_case_ids) / nrow(df_toclean), 4)
))

## Persist the cleaned data, then record the write and close the log
write.csv(df_analytic,
          file = here("data/analytic_df.csv"),
          row.names = FALSE)

log_print("Wrote cleaned analytic_df.csv")

log_close()

